PyPI - doctra - Versions diffs - 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl - Mend

doctra 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

doctra/engines/vlm/service.py +0 -12
doctra/parsers/structured_pdf_parser.py +11 -60
doctra/parsers/table_chart_extractor.py +8 -44
doctra/ui/app.py +5 -32
doctra/utils/progress.py +13 -69
doctra/utils/structured_utils.py +45 -49
doctra/version.py +1 -1
{doctra-0.3.1.dist-info → doctra-0.3.3.dist-info}/METADATA +1 -1
{doctra-0.3.1.dist-info → doctra-0.3.3.dist-info}/RECORD +12 -12
{doctra-0.3.1.dist-info → doctra-0.3.3.dist-info}/WHEEL +0 -0
{doctra-0.3.1.dist-info → doctra-0.3.3.dist-info}/licenses/LICENSE +0 -0
{doctra-0.3.1.dist-info → doctra-0.3.3.dist-info}/top_level.txt +0 -0

doctra/engines/vlm/service.py CHANGED Viewed

@@ -19,7 +19,6 @@ class VLMStructuredExtractor:
         chart = vlm.extract_chart("/abs/path/chart.jpg")
         table = vlm.extract_table("/abs/path/table.jpg")
-        # Or with Anthropic:
         vlm = VLMStructuredExtractor(vlm_provider="anthropic", api_key="YOUR_KEY")
     """
@@ -32,8 +31,6 @@ class VLMStructuredExtractor:
     ):
         """
         Initialize the VLMStructuredExtractor with provider configuration.
-        Sets up the VLM model for structured data extraction from images.
         :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
         :param vlm_model: Model name to use (defaults to provider-specific defaults)
@@ -60,8 +57,6 @@ class VLMStructuredExtractor:
         :raises Exception: If image processing or VLM call fails
         """
         try:
-            # Normalize path and verify readability
-            # (get_image_from_local already absolutizes & raises if missing)
             img = get_image_from_local(image_path)
             if img.mode != "RGB":
                 img = img.convert("RGB")
@@ -71,15 +66,11 @@ class VLMStructuredExtractor:
             return result
         except Exception as e:
-            # Re-raise so caller can handle/log too
             raise
     def extract_chart(self, image_path: str) -> Chart:
         """
         Extract structured chart data from an image.
-        Uses VLM to analyze a chart image and extract the data in a structured
-        format with title, headers, and rows.
         :param image_path: Path to the chart image file
         :return: Chart object containing extracted title, headers, and data rows
@@ -96,9 +87,6 @@ class VLMStructuredExtractor:
     def extract_table(self, image_path: str) -> Table:
         """
         Extract structured table data from an image.
-        Uses VLM to analyze a table image and extract the data in a structured
-        format with title, headers, and rows.
         :param image_path: Path to the table image file
         :return: Table object containing extracted title, headers, and data rows

doctra/parsers/structured_pdf_parser.py CHANGED Viewed

@@ -64,22 +64,19 @@ class StructuredPDFParser:
     ):
         """
         Initialize the StructuredPDFParser with processing configuration.
-        Sets up the layout detection engine, OCR engine, and optionally
-        the VLM service for comprehensive document processing.
-        :param use_vlm: Whether to use VLM for structured data extraction
-        :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter")
+        :param use_vlm: Whether to use VLM for structured data extraction (default: False)
+        :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
         :param vlm_model: Model name to use (defaults to provider-specific defaults)
-        :param vlm_api_key: API key for VLM provider
-        :param layout_model_name: Layout detection model name
-        :param dpi: DPI for PDF rendering
-        :param min_score: Minimum confidence score for layout detection
-        :param ocr_lang: OCR language code
-        :param ocr_psm: Tesseract page segmentation mode
-        :param ocr_oem: Tesseract OCR engine mode
-        :param ocr_extra_config: Additional Tesseract configuration
-        :param box_separator: Separator between text boxes in output
+        :param vlm_api_key: API key for VLM provider (required if use_vlm is True)
+        :param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
+        :param dpi: DPI for PDF rendering (default: 200)
+        :param min_score: Minimum confidence score for layout detection (default: 0.0)
+        :param ocr_lang: OCR language code (default: "eng")
+        :param ocr_psm: Tesseract page segmentation mode (default: 4)
+        :param ocr_oem: Tesseract OCR engine mode (default: 3)
+        :param ocr_extra_config: Additional Tesseract configuration (default: "")
+        :param box_separator: Separator between text boxes in output (default: "\n")
         """
         self.layout_engine = PaddleLayoutEngine(model_name=layout_model_name)
         self.dpi = dpi
@@ -100,15 +97,10 @@ class StructuredPDFParser:
     def parse(self, pdf_path: str) -> None:
         """
         Parse a PDF document and extract all content types.
-        Processes the PDF through layout detection, extracts text using OCR,
-        saves images for visual elements, and optionally converts charts/tables
-        to structured data using VLM.
         :param pdf_path: Path to the input PDF file
         :return: None
         """
-        # Extract filename without extension and create output directory
         pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]
         out_dir = f"outputs/{pdf_filename}/full_parse"
@@ -120,7 +112,6 @@ class StructuredPDFParser:
         )
         pil_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]
-        # Count for progress bars
         fig_count = sum(sum(1 for b in p.boxes if b.label == "figure") for p in pages)
         chart_count = sum(sum(1 for b in p.boxes if b.label == "chart") for p in pages)
         table_count = sum(sum(1 for b in p.boxes if b.label == "table") for p in pages)
@@ -133,11 +124,8 @@ class StructuredPDFParser:
         figures_desc = "Figures (cropped)"
         with ExitStack() as stack:
-            # Enhanced environment detection
             is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
             is_terminal = hasattr(sys.stdout, 'isatty') and sys.stdout.isatty()
-            # Use appropriate progress bars based on environment
             if is_notebook:
                 charts_bar = stack.enter_context(
                     create_notebook_friendly_bar(total=chart_count, desc=charts_desc)) if chart_count else None
@@ -165,13 +153,11 @@ class StructuredPDFParser:
                         rel = os.path.relpath(abs_img_path, out_dir)
                         if box.label == "figure":
-                            # Figures are always images in MD
                             md_lines.append(f"![Figure — page {page_num}]({rel})\n")
                             if figures_bar: figures_bar.update(1)
                         elif box.label == "chart":
                             if self.use_vlm and self.vlm:
-                                # Try structured → Markdown table; fallback to image if it fails
                                 wrote_table = False
                                 try:
                                     chart = self.vlm.extract_chart(abs_img_path)
@@ -193,7 +179,6 @@ class StructuredPDFParser:
                         elif box.label == "table":
                             if self.use_vlm and self.vlm:
-                                # Try structured → Markdown table; fallback to image if it fails
                                 wrote_table = False
                                 try:
                                     table = self.vlm.extract_table(abs_img_path)
@@ -229,7 +214,6 @@ class StructuredPDFParser:
             html_structured_path = os.path.join(out_dir, "tables.html")
             write_structured_html(html_structured_path, structured_items)
-        # Print completion message with output directory
         print(f"✅ Parsing completed successfully!")
         print(f"📁 Output directory: {out_dir}")
@@ -249,30 +233,25 @@ class StructuredPDFParser:
         :param save_path: Optional path to save the visualization (if None, displays only)
         :return: None
         """
-        # Get layout predictions
         pages: List[LayoutPage] = self.layout_engine.predict_pdf(
             pdf_path, batch_size=1, layout_nms=True, dpi=self.dpi, min_score=self.min_score
         )
         pil_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]
-        # Limit to requested number of pages
         pages_to_show = min(num_pages, len(pages))
         if pages_to_show == 0:
             print("No pages to display")
             return
-        # Calculate grid dimensions
         rows = (pages_to_show + cols - 1) // cols
-        # Collect unique labels from the processed pages and assign colors
         used_labels = set()
         for idx in range(pages_to_show):
             page = pages[idx]
             for box in page.boxes:
                 used_labels.add(box.label.lower())
-        # Create dynamic color assignment for all detected labels
         base_colors = ['#3B82F6', '#EF4444', '#10B981', '#F59E0B', '#8B5CF6',
                        '#F97316', '#EC4899', '#6B7280', '#84CC16', '#06B6D4',
                        '#DC2626', '#059669', '#7C3AED', '#DB2777', '#0891B2']
@@ -281,22 +260,18 @@ class StructuredPDFParser:
         for i, label in enumerate(sorted(used_labels)):
             dynamic_label_colors[label] = base_colors[i % len(base_colors)]
-        # Process each page and add bounding boxes
         processed_pages = []
         for idx in range(pages_to_show):
             page = pages[idx]
             page_img = pil_pages[idx].copy()
-            # Calculate scale factor to resize to target width
             scale_factor = page_width / page_img.width
             new_height = int(page_img.height * scale_factor)
             page_img = page_img.resize((page_width, new_height), Image.LANCZOS)
-            # Create drawing context
             draw = ImageDraw.Draw(page_img)
-            # Try to load a nice font, fallback to default
             try:
                 font = ImageFont.truetype("arial.ttf", 24)
                 small_font = ImageFont.truetype("arial.ttf", 18)
@@ -308,21 +283,16 @@ class StructuredPDFParser:
                     font = None
                     small_font = None
-            # Draw bounding boxes
             for box in page.boxes:
-                # Scale coordinates
                 x1 = int(box.x1 * scale_factor)
                 y1 = int(box.y1 * scale_factor)
                 x2 = int(box.x2 * scale_factor)
                 y2 = int(box.y2 * scale_factor)
-                # Get color for this label from dynamic assignment
                 color = dynamic_label_colors.get(box.label.lower(), '#000000')
-                # Draw rectangle with rounded corners effect
                 draw.rectangle([x1, y1, x2, y2], outline=color, width=3)
-                # Draw label background
                 label_text = f"{box.label} ({box.score:.2f})"
                 if font:
                     bbox = draw.textbbox((0, 0), label_text, font=small_font)
@@ -332,11 +302,9 @@ class StructuredPDFParser:
                     text_width = len(label_text) * 8
                     text_height = 15
-                # Position label above the box
                 label_x = x1
                 label_y = max(0, y1 - text_height - 8)
-                # Draw label background with padding
                 padding = 4
                 draw.rectangle([
                     label_x - padding,
@@ -345,10 +313,8 @@ class StructuredPDFParser:
                     label_y + text_height + padding
                 ], fill='white', outline=color, width=2)
-                # Draw label text
                 draw.text((label_x, label_y), label_text, fill=color, font=small_font)
-            # Add page title
             title_text = f"Page {page.page_index} ({len(page.boxes)} boxes)"
             if font:
                 title_bbox = draw.textbbox((0, 0), title_text, font=font)
@@ -356,7 +322,6 @@ class StructuredPDFParser:
             else:
                 title_width = len(title_text) * 12
-            # Draw title background
             title_x = (page_width - title_width) // 2
             title_y = 10
             draw.rectangle([title_x - 10, title_y - 5, title_x + title_width + 10, title_y + 35],
@@ -365,16 +330,13 @@ class StructuredPDFParser:
             processed_pages.append(page_img)
-        # Create grid layout with space for legend
         legend_width = 250
         grid_width = cols * page_width + (cols - 1) * spacing
         total_width = grid_width + legend_width + spacing
         grid_height = rows * (processed_pages[0].height if processed_pages else 600) + (rows - 1) * spacing
-        # Create final grid image with modern background
         final_img = Image.new('RGB', (total_width, grid_height), '#F8FAFC')
-        # Place pages in grid
         for idx, page_img in enumerate(processed_pages):
             row = idx // cols
             col = idx % cols
@@ -384,13 +346,11 @@ class StructuredPDFParser:
             final_img.paste(page_img, (x_pos, y_pos))
-        # Create legend
         legend_x = grid_width + spacing
         legend_y = 20
         draw_legend = ImageDraw.Draw(final_img)
-        # Legend title
         legend_title = "Element Types"
         if font:
             title_bbox = draw_legend.textbbox((0, 0), legend_title, font=font)
@@ -400,47 +360,38 @@ class StructuredPDFParser:
             title_width = len(legend_title) * 12
             title_height = 20
-        # Draw legend background
         legend_bg_height = len(used_labels) * 35 + title_height + 40
         draw_legend.rectangle([legend_x - 10, legend_y - 10,
                                legend_x + legend_width - 10, legend_y + legend_bg_height],
                               fill='white', outline='#E5E7EB', width=2)
-        # Draw legend title
         draw_legend.text((legend_x + 10, legend_y + 5), legend_title,
                          fill='#1F2937', font=font)
-        # Draw legend items - now using dynamic colors for actually detected labels
         current_y = legend_y + title_height + 20
         for label in sorted(used_labels):
             color = dynamic_label_colors[label]
-            # Draw color square
             square_size = 20
             draw_legend.rectangle([legend_x + 10, current_y,
                                    legend_x + 10 + square_size, current_y + square_size],
                                   fill=color, outline='#6B7280', width=1)
-            # Draw label text
             draw_legend.text((legend_x + 40, current_y + 2), label.title(),
                              fill='#374151', font=small_font)
             current_y += 30
-        # Save or display
         if save_path:
             final_img.save(save_path, quality=95, optimize=True)
             print(f"Layout visualization saved to: {save_path}")
         else:
-            # Display using PIL's default viewer
             final_img.show()
-        # Print summary statistics
         print(f"\n📊 Layout Detection Summary for {os.path.basename(pdf_path)}:")
         print(f"Pages processed: {pages_to_show}")
-        # Create summary by label across all pages
         total_counts = {}
         for idx in range(pages_to_show):
             page = pages[idx]

doctra/parsers/table_chart_extractor.py CHANGED Viewed

@@ -61,22 +61,17 @@ class ChartTablePDFParser:
     ):
         """
         Initialize the ChartTablePDFParser with extraction configuration.
-        Sets up the layout detection engine and optionally the VLM service
-        for structured data extraction.
-        :param extract_charts: Whether to extract charts from the document
-        :param extract_tables: Whether to extract tables from the document
-        :param use_vlm: Whether to use VLM for structured data extraction
-        :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter")
+        :param extract_charts: Whether to extract charts from the document (default: True)
+        :param extract_tables: Whether to extract tables from the document (default: True)
+        :param use_vlm: Whether to use VLM for structured data extraction (default: False)
+        :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
         :param vlm_model: Model name to use (defaults to provider-specific defaults)
-        :param vlm_api_key: API key for VLM provider
-        :param layout_model_name: Layout detection model name
-        :param dpi: DPI for PDF rendering
-        :param min_score: Minimum confidence score for layout detection
-        :raises ValueError: If neither extract_charts nor extract_tables is True
+        :param vlm_api_key: API key for VLM provider (required if use_vlm is True)
+        :param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
+        :param dpi: DPI for PDF rendering (default: 200)
+        :param min_score: Minimum confidence score for layout detection (default: 0.0)
         """
-        # Validation
         if not extract_charts and not extract_tables:
             raise ValueError("At least one of extract_charts or extract_tables must be True")
@@ -98,21 +93,15 @@ class ChartTablePDFParser:
     def parse(self, pdf_path: str, output_base_dir: str = "outputs") -> None:
         """
         Parse a PDF document and extract charts and/or tables.
-        Processes the PDF through layout detection, extracts the specified
-        element types, saves cropped images, and optionally converts them
-        to structured data using VLM.
         :param pdf_path: Path to the input PDF file
         :param output_base_dir: Base directory for output files (default: "outputs")
         :return: None
         """
-        # Create output directory structure: outputs/<filename>/structured_parsing/
         pdf_name = Path(pdf_path).stem
         out_dir = os.path.join(output_base_dir, pdf_name, "structured_parsing")
         os.makedirs(out_dir, exist_ok=True)
-        # Create subdirectories based on what we're extracting
         charts_dir = None
         tables_dir = None
@@ -129,24 +118,20 @@ class ChartTablePDFParser:
         )
         pil_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]
-        # Determine which labels to extract
         target_labels = []
         if self.extract_charts:
             target_labels.append("chart")
         if self.extract_tables:
             target_labels.append("table")
-        # Count items for progress bars
         chart_count = sum(sum(1 for b in p.boxes if b.label == "chart") for p in pages) if self.extract_charts else 0
         table_count = sum(sum(1 for b in p.boxes if b.label == "table") for p in pages) if self.extract_tables else 0
-        # Prepare output content
         if self.use_vlm:
             md_lines: List[str] = ["# Extracted Charts and Tables\n"]
             structured_items: List[Dict[str, Any]] = []
             vlm_items: List[Dict[str, Any]] = []
-        # Progress bar descriptions
         charts_desc = "Charts (VLM → table)" if self.use_vlm else "Charts (cropped)"
         tables_desc = "Tables (VLM → table)" if self.use_vlm else "Tables (cropped)"
@@ -154,11 +139,9 @@ class ChartTablePDFParser:
         table_counter = 1
         with ExitStack() as stack:
-            # Enhanced environment detection
             is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
             is_terminal = hasattr(sys.stdout, 'isatty') and sys.stdout.isatty()
-            # Use appropriate progress bars based on environment
             if is_notebook:
                 charts_bar = stack.enter_context(
                     create_notebook_friendly_bar(total=chart_count, desc=charts_desc)) if chart_count else None
@@ -174,23 +157,19 @@ class ChartTablePDFParser:
                 page_num = p.page_index
                 page_img: Image.Image = pil_pages[page_num - 1]
-                # Only process selected item types
                 target_items = [box for box in p.boxes if box.label in target_labels]
                 if target_items and self.use_vlm:
                     md_lines.append(f"\n## Page {page_num}\n")
                 for box in sorted(target_items, key=reading_order_key):
-                    # Handle charts
                     if box.label == "chart" and self.extract_charts:
                         chart_filename = f"chart_{chart_counter:03d}.png"
                         chart_path = os.path.join(charts_dir, chart_filename)
-                        # Save image
                         cropped_img = page_img.crop((box.x1, box.y1, box.x2, box.y2))
                         cropped_img.save(chart_path)
-                        # Handle VLM processing if enabled
                         if self.use_vlm and self.vlm:
                             rel_path = os.path.join("charts", chart_filename)
                             wrote_table = False
@@ -227,16 +206,13 @@ class ChartTablePDFParser:
                         if charts_bar:
                             charts_bar.update(1)
-                    # Handle tables
                     elif box.label == "table" and self.extract_tables:
                         table_filename = f"table_{table_counter:03d}.png"
                         table_path = os.path.join(tables_dir, table_filename)
-                        # Save image
                         cropped_img = page_img.crop((box.x1, box.y1, box.x2, box.y2))
                         cropped_img.save(table_path)
-                        # Handle VLM processing if enabled
                         if self.use_vlm and self.vlm:
                             rel_path = os.path.join("tables", table_filename)
                             wrote_table = False
@@ -273,19 +249,11 @@ class ChartTablePDFParser:
                         if tables_bar:
                             tables_bar.update(1)
-        # Write outputs only if VLM is used
-        md_path = None
         excel_path = None
         if self.use_vlm:
-            # Write markdown file
-            md_path = os.path.join(out_dir, "charts.md")
-            with open(md_path, 'w', encoding='utf-8') as f:
-                f.write('\n'.join(md_lines))
-            # Write Excel file if we have structured data
             if structured_items:
-                # Determine Excel filename based on extraction target
                 if self.extract_charts and self.extract_tables:
                     excel_filename = "parsed_tables_charts.xlsx"
                 elif self.extract_charts:
@@ -299,23 +267,19 @@ class ChartTablePDFParser:
                 excel_path = os.path.join(out_dir, excel_filename)
                 write_structured_excel(excel_path, structured_items)
-                # Also create HTML version
                 html_filename = excel_filename.replace('.xlsx', '.html')
                 html_path = os.path.join(out_dir, html_filename)
                 write_structured_html(html_path, structured_items)
-            # Write VLM items mapping for UI linkage
             if 'vlm_items' in locals() and vlm_items:
                 with open(os.path.join(out_dir, "vlm_items.json"), 'w', encoding='utf-8') as jf:
                     json.dump(vlm_items, jf, ensure_ascii=False, indent=2)
-        # Print results
         extraction_types = []
         if self.extract_charts:
             extraction_types.append("charts")
         if self.extract_tables:
             extraction_types.append("tables")
-        # Print completion message with output directory
         print(f"✅ Parsing completed successfully!")
         print(f"📁 Output directory: {out_dir}")

doctra/ui/app.py CHANGED Viewed

@@ -17,13 +17,10 @@ def _gather_outputs(out_dir: Path, allowed_kinds: Optional[List[str]] = None, zi
     if out_dir.exists():
         if is_structured_parsing:
-            # For structured parsing, show ALL files in the directory
             for file_path in sorted(out_dir.rglob("*")):
                 if file_path.is_file():
                     file_paths.append(str(file_path))
         else:
-            # For full parsing, use the original logic
-            # Always add main output files (HTML, Markdown, etc.) regardless of allowed_kinds
             main_files = [
                 "result.html",
                 "result.md",
@@ -36,22 +33,18 @@ def _gather_outputs(out_dir: Path, allowed_kinds: Optional[List[str]] = None, zi
                 if file_path.exists():
                     file_paths.append(str(file_path))
-            # Add image files based on allowed_kinds or all images if not specified
             if allowed_kinds:
                 for kind in allowed_kinds:
-                    # ChartTablePDFParser saves directly to charts/ and tables/ directories
                     p = out_dir / kind
                     if p.exists():
-                        for img in sorted(p.glob("*.png")):  # ChartTablePDFParser saves as .png
+                        for img in sorted(p.glob("*.png")):
                             file_paths.append(str(img))
-                    # Also check images/ subdirectories (for StructuredPDFParser)
                     images_dir = out_dir / "images" / kind
                     if images_dir.exists():
-                        for img in sorted(images_dir.glob("*.jpg")):  # StructuredPDFParser saves as .jpg
+                        for img in sorted(images_dir.glob("*.jpg")):
                             file_paths.append(str(img))
             else:
-                # Fallback: look in both direct directories and images/ subdirectories
                 for p in (out_dir / "charts").glob("*.png"):
                     file_paths.append(str(p))
                 for p in (out_dir / "tables").glob("*.png"):
@@ -59,7 +52,6 @@ def _gather_outputs(out_dir: Path, allowed_kinds: Optional[List[str]] = None, zi
                 for p in (out_dir / "images").rglob("*.jpg"):
                     file_paths.append(str(p))
-            # Add Excel files based on extraction target (for structured parsing)
             if allowed_kinds:
                 if "charts" in allowed_kinds and "tables" in allowed_kinds:
                     excel_files = ["parsed_tables_charts.xlsx"]
@@ -77,30 +69,24 @@ def _gather_outputs(out_dir: Path, allowed_kinds: Optional[List[str]] = None, zi
     kinds = allowed_kinds if allowed_kinds else ["tables", "charts", "figures"]
     for sub in kinds:
-        # Look in both direct directories and images/ subdirectories
-        # First try direct directories (for ChartTablePDFParser)
         p = out_dir / sub
         if p.exists():
-            for img in sorted(p.glob("*.png")):  # ChartTablePDFParser saves as .png
+            for img in sorted(p.glob("*.png")):
                 gallery_items.append((str(img), f"{sub}: {img.name}"))
-        # Also try images/ subdirectories (for StructuredPDFParser)
         images_dir = out_dir / "images" / sub
         if images_dir.exists():
-            for img in sorted(images_dir.glob("*.jpg")):  # StructuredPDFParser saves as .jpg
+            for img in sorted(images_dir.glob("*.jpg")):
                 gallery_items.append((str(img), f"{sub}: {img.name}"))
     tmp_zip_dir = Path(tempfile.mkdtemp(prefix="doctra_zip_"))
-    # Use custom filename if provided, otherwise use default
     if zip_filename:
-        # Clean the filename to be safe for file systems
         safe_filename = re.sub(r'[<>:"/\\|?*]', '_', zip_filename)
         zip_base = tmp_zip_dir / safe_filename
     else:
         zip_base = tmp_zip_dir / "doctra_outputs"
-    # Create a filtered copy of the output directory excluding temp files
     filtered_dir = tmp_zip_dir / "filtered_outputs"
     shutil.copytree(out_dir, filtered_dir, ignore=shutil.ignore_patterns('~$*', '*.tmp', '*.temp'))
@@ -125,13 +111,10 @@ def _parse_markdown_by_pages(md_content: str) -> List[Dict[str, Any]]:
     while i < len(lines):
         line = lines[i].strip()
-        # Check for page header
         if line.startswith('## Page '):
-            # Save previous page if exists
             if current_page:
                 pages.append(current_page)
-            # Start new page
             page_num = line.replace('## Page ', '').strip()
             current_page = {
                 'page_num': page_num,
@@ -145,15 +128,12 @@ def _parse_markdown_by_pages(md_content: str) -> List[Dict[str, Any]]:
             i += 1
             continue
-        # Check for images (tables, charts, figures)
         if line.startswith('![') and '](images/' in line:
-            # Extract image info
             match = re.match(r'!\[([^\]]+)\]\(([^)]+)\)', line)
             if match:
                 caption = match.group(1)
                 img_path = match.group(2)
-                # Categorize by type
                 if 'Table' in caption:
                     current_page['tables'].append({'caption': caption, 'path': img_path})
                 elif 'Chart' in caption:
@@ -163,18 +143,15 @@ def _parse_markdown_by_pages(md_content: str) -> List[Dict[str, Any]]:
                 current_page['images'].append({'caption': caption, 'path': img_path})
-                # Add to full content with proper markdown formatting
                 current_page['full_content'].append(f"![{caption}]({img_path})")
-        # Regular content
         elif current_page:
-            if line:  # Only add non-empty lines
+            if line:
                 current_page['content'].append(line)
             current_page['full_content'].append(line)
         i += 1
-    # Add the last page
     if current_page:
         pages.append(current_page)
@@ -198,12 +175,9 @@ def run_full_parse(
     if not pdf_file:
         return ("No file provided.", None, [], [], "")
-    # Extract filename from the uploaded file path
-    # Gradio provides the original filename in the file path
     original_filename = Path(pdf_file).stem
     tmp_dir = Path(tempfile.mkdtemp(prefix="doctra_"))
-    # Use original filename for temp file so parser creates correct output directory
     input_pdf = tmp_dir / f"{original_filename}.pdf"
     shutil.copy2(pdf_file, input_pdf)
@@ -295,7 +269,6 @@ def run_extract(
     original_filename = Path(pdf_file).stem
     tmp_dir = Path(tempfile.mkdtemp(prefix="doctra_"))
-    # Use original filename for temp file so parser creates correct output directory
     input_pdf = tmp_dir / f"{original_filename}.pdf"
     shutil.copy2(pdf_file, input_pdf)

doctra/utils/progress.py CHANGED Viewed

@@ -40,7 +40,6 @@ def _detect_environment() -> Tuple[bool, bool, bool]:
     Returns (is_notebook, is_tty, is_windows).
     """
     is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
-    # Colab/Kaggle specifics
     if "google.colab" in sys.modules:
         is_notebook = True
     if "kaggle_secrets" in sys.modules or "kaggle_web_client" in sys.modules:
@@ -59,7 +58,6 @@ def _select_emoji(key: str) -> str:
       - ascii: ASCII text tokens
       - none: empty prefix
     """
-    # Maps
     default_map = {
         "loading": "🔄",
         "charts": "📊",
@@ -70,14 +68,13 @@ def _select_emoji(key: str) -> str:
         "processing": "⚙️",
     }
     safe_map = {
-        # Use BMP or geometric shapes likely to render everywhere
         "loading": "⏳",
         "charts": "▦",
         "tables": "▤",
         "figures": "▧",
         "ocr": "🔎",
         "vlm": "★",
-        "processing": "⚙",  # no variation selector
+        "processing": "⚙",
     }
     ascii_map = {
         "loading": "[loading]",
@@ -89,13 +86,11 @@ def _select_emoji(key: str) -> str:
         "processing": "[processing]",
     }
-    # Determine effective mode
     mode = _PROGRESS_CONFIG.emoji_mode
     is_notebook, _, is_windows = _detect_environment()
     if not _PROGRESS_CONFIG.use_emoji:
         mode = "none"
     elif mode == "default":
-        # Heuristics: prefer safe in Colab/Kaggle notebooks and Windows terminals
         if is_windows or "google.colab" in sys.modules or "kaggle_secrets" in sys.modules:
             mode = "safe"
@@ -105,7 +100,6 @@ def _select_emoji(key: str) -> str:
         return ascii_map.get(key, "")
     if mode == "safe":
         return safe_map.get(key, safe_map["processing"])
-    # default
     return default_map.get(key, default_map["processing"])
@@ -119,17 +113,13 @@ def _supports_unicode_output() -> bool:
     except Exception:
         pass
-    # Heuristics for common notebook environments that support emoji
     env = os.environ
     if any(k in env for k in ("COLAB_GPU", "GCE_METADATA_HOST", "KAGGLE_KERNEL_RUN_TYPE", "JPY_PARENT_PID")):
         return True
-    # On modern Windows terminals with UTF-8 code page, assume yes
     if sys.platform.startswith("win"):
-        # If user opted-in to force ASCII, respect it
         if _PROGRESS_CONFIG.force_ascii:
             return False
-        # Try to detect WT/Terminal/VSCode which usually handle Unicode
         if any(k in env for k in ("WT_SESSION", "TERM_PROGRAM", "VSCODE_PID")):
             return True
@@ -161,19 +151,15 @@ def create_beautiful_progress_bar(
     :return: Configured tqdm progress bar instance
     """
-    # Enhanced styling parameters - notebook-friendly format
     is_notebook, is_tty, is_windows = _detect_environment()
     if is_notebook:
-        # Simpler format for notebooks to avoid display issues
         bar_format = "{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]"
     else:
-        # Full format for terminal
         bar_format = (
             "{l_bar}{bar:30}| {n_fmt}/{total_fmt} "
             "[{elapsed}<{remaining}, {rate_fmt}{postfix}]"
         )
-    # Color schemes based on operation type
     color_schemes = {
         "loading": {"colour": "cyan", "ncols": 100},
         "charts": {"colour": "green", "ncols": 100},
@@ -184,7 +170,6 @@ def create_beautiful_progress_bar(
         "processing": {"colour": "white", "ncols": 100},
     }
-    # Determine color scheme based on description
     desc_lower = desc.lower()
     if "loading" in desc_lower or "model" in desc_lower:
         color_scheme = color_schemes["loading"]
@@ -201,45 +186,37 @@ def create_beautiful_progress_bar(
     else:
         color_scheme = color_schemes["processing"]
-    # Emoji categories
     emoji_categories = {"loading", "charts", "tables", "figures", "ocr", "vlm", "processing"}
-    # Add appropriate emoji to description (can be disabled)
     if _PROGRESS_CONFIG.use_emoji:
         prefix_key = next((k for k in emoji_categories if k in desc_lower), "processing")
         prefix = _select_emoji(prefix_key)
         if prefix:
             desc = f"{prefix} {desc}"
-    # Enhanced tqdm configuration
     tqdm_config = {
         "total": total,
         "desc": desc,
         "leave": leave,
         "bar_format": bar_format,
         "ncols": _PROGRESS_CONFIG.ncols_env or color_scheme["ncols"],
-        # Prefer Unicode unless user forces ASCII or environment lacks Unicode support
         "ascii": _PROGRESS_CONFIG.force_ascii or not _supports_unicode_output(),
-        "dynamic_ncols": True,  # Responsive width
-        "smoothing": 0.3,  # Smooth progress updates
-        "mininterval": 0.1,  # Minimum update interval
-        "maxinterval": 1.0,  # Maximum update interval
+        "dynamic_ncols": True,
+        "smoothing": 0.3,
+        "mininterval": 0.1,
+        "maxinterval": 1.0,
         "position": position,
         **kwargs
     }
-    # Enhanced environment detection
     is_notebook, is_terminal, is_windows = _detect_environment()
-    # Add color only for terminal environments (not notebooks)
     if not is_notebook and is_terminal:
         tqdm_config["colour"] = color_scheme["colour"]
-    # Respect global disable
     if _PROGRESS_CONFIG.disable:
         tqdm_config["disable"] = True
-    # Try creating the progress bar with Unicode, fallback to ASCII on failure (e.g., Windows code page)
     if is_notebook:
         tqdm_config.pop("colour", None)
         try:
@@ -297,7 +274,6 @@ def update_progress_with_info(
     :param info: Optional dictionary of information to display
     """
     if info:
-        # Format info as postfix
         postfix_parts = []
         for key, value in info.items():
             if isinstance(value, float):
@@ -354,54 +330,22 @@ def create_notebook_friendly_bar(
     **kwargs
 ) -> tqdm:
     """
-    Create a notebook-friendly progress bar with minimal formatting.
+    Create a notebook-friendly progress bar with consistent sizing.
-    This function creates progress bars specifically optimized for Jupyter notebooks
-    to avoid display issues and ANSI code problems.
+    This function creates progress bars that match the main progress bar
+    styling and behavior in notebook environments.
     :param total: Total number of items to process
     :param desc: Description text for the progress bar
     :param kwargs: Additional tqdm parameters
     :return: Configured notebook-friendly progress bar
     """
-    # Force notebook mode
-    if _PROGRESS_CONFIG.disable:
-        kwargs["disable"] = True
-    else:
-        kwargs["disable"] = False
-    # Prefer Unicode in notebooks if supported
-    if "ascii" not in kwargs:
-        kwargs["ascii"] = _PROGRESS_CONFIG.force_ascii or not _supports_unicode_output()
-    # Emoji categories
-    emoji_categories = {"loading", "charts", "tables", "figures", "ocr", "vlm", "processing"}
-    # Add appropriate emoji to description
-    desc_lower = desc.lower()
-    if _PROGRESS_CONFIG.use_emoji:
-        prefix_key = next((k for k in emoji_categories if k in desc_lower), "processing")
-        prefix = _select_emoji(prefix_key)
-        if prefix:
-            desc = f"{prefix} {desc}"
-    # Simple format for notebooks
-    bar_format = "{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt}"
-    tqdm_config = {
-        "total": total,
-        "desc": desc,
-        "leave": True,
-        "bar_format": bar_format,
-        "ncols": _PROGRESS_CONFIG.ncols_env or 80,
-        "ascii": kwargs.get("ascii", False),
-        "dynamic_ncols": False,  # Fixed width for notebooks
-        "smoothing": 0.1,  # Faster updates
-        "mininterval": 0.05,
-        "maxinterval": 0.5,
+    return create_beautiful_progress_bar(
+        total=total,
+        desc=desc,
+        leave=True,
         **kwargs
-    }
-    return tqdm_auto(**tqdm_config)
+    )
 def progress_for(iterable: Iterable[Any], desc: str, total: Optional[int] = None, leave: bool = True, **kwargs) -> Iterator[Any]:

doctra/utils/structured_utils.py CHANGED Viewed

@@ -1,49 +1,45 @@
-from __future__ import annotations
-from typing import Any, Dict, Optional
-import json
-try:
-    from pydantic import BaseModel  # type: ignore
-except Exception:  # pydantic not strictly required for normalization
-    class BaseModel:  # fallback stub
-        pass
-def to_structured_dict(obj: Any) -> Optional[Dict[str, Any]]:
-    """
-    Accepts a VLM result that might be:
-      - JSON string
-      - dict
-      - Pydantic BaseModel (v1 .dict() or v2 .model_dump())
-    Returns a normalized dict with keys: title, headers, rows — or None.
-    """
-    if obj is None:
-        return None
-    # JSON string from VLM
-    if isinstance(obj, str):
-        try:
-            obj = json.loads(obj)
-        except Exception:
-            return None
-    # Pydantic model
-    if isinstance(obj, BaseModel):
-        try:
-            return obj.model_dump()  # pydantic v2
-        except Exception:
-            try:
-                return obj.dict()    # pydantic v1
-            except Exception:
-                return None
-    # Plain dict
-    if isinstance(obj, dict):
-        title = obj.get("title") or "Untitled"
-        headers = obj.get("headers") or []
-        rows = obj.get("rows") or []
-        # Basic shape checks
-        if not isinstance(headers, list) or not isinstance(rows, list):
-            return None
-        return {"title": title, "headers": headers, "rows": rows}
-    return None
+from __future__ import annotations
+from typing import Any, Dict, Optional
+import json
+try:
+    from pydantic import BaseModel  # type: ignore
+except Exception:
+    class BaseModel:
+        pass
+def to_structured_dict(obj: Any) -> Optional[Dict[str, Any]]:
+    """
+    Accepts a VLM result that might be:
+      - JSON string
+      - dict
+      - Pydantic BaseModel (v1 .dict() or v2 .model_dump())
+    Returns a normalized dict with keys: title, headers, rows — or None.
+    """
+    if obj is None:
+        return None
+    if isinstance(obj, str):
+        try:
+            obj = json.loads(obj)
+        except Exception:
+            return None
+    if isinstance(obj, BaseModel):
+        try:
+            return obj.model_dump()
+        except Exception:
+            try:
+                return obj.dict()
+            except Exception:
+                return None
+    if isinstance(obj, dict):
+        title = obj.get("title") or "Untitled"
+        headers = obj.get("headers") or []
+        rows = obj.get("rows") or []
+        if not isinstance(headers, list) or not isinstance(rows, list):
+            return None
+        return {"title": title, "headers": headers, "rows": rows}
+    return None

doctra/version.py CHANGED Viewed

@@ -1,2 +1,2 @@
 """Version information for Doctra."""
-__version__ = '0.3.1'
+__version__ = '0.3.3'

{doctra-0.3.1.dist-info → doctra-0.3.3.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: doctra
-Version: 0.3.1
+Version: 0.3.3
 Summary: Parse, extract, and analyze documents with ease
 Home-page: https://github.com/AdemBoukhris457/Doctra
 Author: Adem Boukhris

{doctra-0.3.1.dist-info → doctra-0.3.3.dist-info}/RECORD RENAMED Viewed

@@ -1,5 +1,5 @@
 doctra/__init__.py,sha256=ST_c2GWBoB0y_wpL1qsOeK4bR1RyJhMMn6I5VjVRI6Y,613
-doctra/version.py,sha256=BDWZqR8pRPnlsqLDR4Kx91MC6A9OwylJHhHemdaa6DQ,60
+doctra/version.py,sha256=-8CkxAWlU-OCRJP3Yq9OGjh-4nS4-sU-LRjZ28K6oUw,62
 doctra/cli/__init__.py,sha256=4PTujjYRShOOUlZ7PwuWckShPWLC4v4CYIhJpzgyv1k,911
 doctra/cli/main.py,sha256=o_W1b5kx3xaTbWK6l4IYi0YLwffKBj5pQKflnlaG2Fw,35611
 doctra/cli/utils.py,sha256=IghiUZQCOmXODC5-5smHGz2KeV4xqbP4avmA1Mggln0,11800
@@ -14,7 +14,7 @@ doctra/engines/ocr/pytesseract_engine.py,sha256=Imz2uwju6himkBiS8CH7DLxBRe-LtmMY
 doctra/engines/vlm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 doctra/engines/vlm/outlines_types.py,sha256=qL-G6MNiA5mxp1qAPVEFhOANp4NqVt_MQKseJCr_xXE,970
 doctra/engines/vlm/provider.py,sha256=aE8Eo1U-8XqAimakNlT0-T4etIyCV8rZ3DwxdqbFeTc,3131
-doctra/engines/vlm/service.py,sha256=Jwws2Jw68-IdHyvEWks4UCoP7Olhqt8IpXfCv5Z7Ml4,4724
+doctra/engines/vlm/service.py,sha256=4ExDbLmyyC3ICXxr7OSIqvbOdrwbIJek-DE54vAUgDA,4151
 doctra/exporters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 doctra/exporters/excel_writer.py,sha256=U5Eb5SF7_ll1QveUapSWSkCRt3OEoisKEVUQ_7X8Wjo,7762
 doctra/exporters/html_writer.py,sha256=OlW24Eg5bZcjldRHtd3GDD7RrajuRXj43EJpXIJkYf8,38810
@@ -23,10 +23,10 @@ doctra/exporters/markdown_table.py,sha256=4_OJIwG_WoIPYBzJx1njy_3tNVdkK6QKSP-P9r
 doctra/exporters/markdown_writer.py,sha256=L7EjF2MB8jYX7XkZ3a3NeeEC8gnb0qzRPTzIN9tdfuw,1027
 doctra/parsers/__init__.py,sha256=8M6LVzcWGpuTIK_1SMXML3ll7zK1CTHXGI5qXvqdm-A,206
 doctra/parsers/layout_order.py,sha256=W6b-T11H907RZ2FaZwNvnYhmvH11rpUzxC5yLkdf28k,640
-doctra/parsers/structured_pdf_parser.py,sha256=fbDIQ6VFv1phFPC3lKgcjtCp0AdNA8Ny1dK0F726Pww,21357
-doctra/parsers/table_chart_extractor.py,sha256=JuoScqCQbPdQjy4ak77OcZHSPYKGHF4H39fEW6gF3eo,15323
+doctra/parsers/structured_pdf_parser.py,sha256=QIZIS5SAaIdGiT8o7G_a4D-Cht7nVLGeSuVzqSYLn14,19160
+doctra/parsers/table_chart_extractor.py,sha256=kSubqX0n0kVu_3jzX6QUyKmEGs9sG3Bg9kzUzn2wPHo,13733
 doctra/ui/__init__.py,sha256=XzOOKeGSBnUREuDQiCIWds1asFSa2nypFQTJXwclROA,85
-doctra/ui/app.py,sha256=FYDlEG_2pfp7SSHnA04NRNUhOcI-BJPh3qAf5dw5D6g,45903
+doctra/ui/app.py,sha256=WpXUWHSs7wSYNjY4iBOZJHsKGQ88jDytvOFIjuhqAGE,44031
 doctra/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 doctra/utils/bbox.py,sha256=R2-95p0KiWvet3TH27TQVvCar7WJg6z0u3L21iEDF-A,674
 doctra/utils/constants.py,sha256=ZWOvNDrvETbQ_pxHiX7vUW4J5Oj8_qnov0QacUOBizI,189
@@ -34,11 +34,11 @@ doctra/utils/file_ops.py,sha256=3IS0EQncs6Kaj27fcg2zxQX3xRSvtItIsyKGLYgeOgw,815
 doctra/utils/io_utils.py,sha256=L1bWV4-ybs2j_3ZEN7GfQVgdC73JKVECVnpwKbP0dy0,219
 doctra/utils/ocr_utils.py,sha256=Doa1uYBg3kRgRYd2aPq9fICHgHfrM_efdhZfI7jl6OM,780
 doctra/utils/pdf_io.py,sha256=c8EY47Z1iqVtlLFHS_n0qGuXJ5ERFaMUd84ivXV0b9E,706
-doctra/utils/progress.py,sha256=sNEjTdN32J1-eXFPqwZRw2EZQ1SXSesXBd5StJvtlmc,14481
+doctra/utils/progress.py,sha256=IKQ_YErWSEd4hddYMUiCORy0_kW4TOYJM891HUEq2_E,11901
 doctra/utils/quiet.py,sha256=5XPS-1CtJ0sVk6qgSQctdhr_wR8mP1xoJLoUbmkXROA,387
-doctra/utils/structured_utils.py,sha256=J-qTqo8eCjm36FaRJ_I482LFgYCpm3eukZm-gbNnchw,1401
-doctra-0.3.1.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
-doctra-0.3.1.dist-info/METADATA,sha256=2-2aMiNRvofe2WYuYejI6NqSkVctiH5SLK-EX4nIjaE,28298
-doctra-0.3.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-doctra-0.3.1.dist-info/top_level.txt,sha256=jI7E8jHci2gP9y0GYaWxlg9jG0O5n3FjHJJPLXDXMds,7
-doctra-0.3.1.dist-info/RECORD,,
+doctra/utils/structured_utils.py,sha256=znC2zr80rZMfIV58lipZ8M4zPq6IF070pdwLBve1qiE,1251
+doctra-0.3.3.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
+doctra-0.3.3.dist-info/METADATA,sha256=GX4AvDkmBPFcmt0drF84Wy2WuiqB0ivNw_7bMEpHuMc,28298
+doctra-0.3.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+doctra-0.3.3.dist-info/top_level.txt,sha256=jI7E8jHci2gP9y0GYaWxlg9jG0O5n3FjHJJPLXDXMds,7
+doctra-0.3.3.dist-info/RECORD,,

{doctra-0.3.1.dist-info → doctra-0.3.3.dist-info}/WHEEL RENAMED Viewed

File without changes

{doctra-0.3.1.dist-info → doctra-0.3.3.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{doctra-0.3.1.dist-info → doctra-0.3.3.dist-info}/top_level.txt RENAMED Viewed

File without changes

doctra 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

doctra 0.3.1-py3-none-any.whl → 0.3.3-py3-none-any.whl